Detect chromosomal duplications

Author

Claudia Zirión-Martínez

Published

February 13, 2025

Setup

Libraries

Code
library(tidyverse)
library(ggtree)
library(ggtreeExtra)
library(ape)
library(ggnewscale)
library(RColorBrewer)
library(svglite)
source("scripts/metadata_colors.R")

Paths

Code
metadata_path <- 
    "data/processed/metadata_ashton_desj_all_weavepop_H99.csv"
duplications_path <- 
    "results/tables/duplications.tsv"
merged_tree_path <- 
    "data/processed/tree_merged.newick"
tree_merged_duplications_path <- 
    "results/trees_dups/tree_merged_duplications.svg"
tree_merged_duplications_only_duplicated <-  
    "results/trees_dups/tree_merged_duplications_only_duplicated.svg"

Metadata

Load the necessary data

Code
metadata <- read.csv(
    metadata_path,
    header = TRUE)

Get one dataframe for each variable to be plotted as a separate metadata column in the tree

Code
metadata$vni_subdivision <- factor(metadata$vni_subdivision,
                            levels = c("VNIa-4", "VNIa-5", "VNIa-32", 
                            "VNIa-93", "VNIa-X", "VNIa-Y", "VNIb", 
                            "VNIc", "VNIa-outlier"))

sublineage <- metadata %>%
                filter(lineage == "VNI")%>%
                select(strain, vni_subdivision)%>%
                column_to_rownames("strain")%>%
                droplevels()
lineage <- metadata %>%
            select(strain, lineage)%>%
            column_to_rownames("strain")
dataset <- metadata %>%
            select(strain, dataset)%>%
            column_to_rownames("strain")
source <- metadata %>%
            select(strain, source)%>%
            column_to_rownames("strain")

Duplications

Code
duplications <- read.delim(
    duplications_path,
    sep = "\t", header = TRUE, stringsAsFactors = TRUE)
Code
duplications_full <- duplications %>%
    select(strain, chromosome) %>%
    distinct()

Make matrix of duplicated chromosomes

Code
dup_chroms <- duplications_full %>%
    select(strain, chromosome)%>%
    mutate(duplicated_full = 1)%>%
    arrange(chromosome)%>%
    pivot_wider(names_from = chromosome, values_from = duplicated_full, values_fill = 0)%>%
    column_to_rownames("strain")%>%
    mutate(across(everything(), ~ ifelse(. == 1, cur_column(),"Euploid")))

euploid_strain <- metadata %>%
    filter(!strain %in% duplications_full$strain)%>%
    select(strain)

for (chrom in colnames(dup_chroms)){
    euploid_strain[chrom] <- "Euploid"
}

dup_chroms <- euploid_strain %>%
    column_to_rownames("strain") %>%
    bind_rows(dup_chroms)

Tree

Code
tree <- read.tree(merged_tree_path)

Remove tips that are not in metadata$strain

Code
tree <- drop.tip(tree, setdiff(tree$tip.label, metadata$strain))

Plots

Code
chrom_colors <- brewer.pal(7, "Paired")
names(chrom_colors) <- c("chr01", "chr04",
                         "chr06", "chr09", 
                         "chr12","chr13", "chr14")
chrom_dup_colors <- c(chrom_colors, "Euploid" = "grey93")

Tree of all samples with duplications of all chromosomes

Code
m <- ggtree(tree, layout = "circular", size = 0.1, branch.length = "none") +  
        geom_tiplab(aes(label = label), size = 0.5, align =TRUE, 
                    linetype = "dashed", linesize = .03)

m1 <- gheatmap(m, dataset, width=.05, colnames=FALSE, offset=2) +
            scale_fill_manual(values = dataset_colors, name="Dataset", na.translate = FALSE)+
            guides(fill = guide_legend(order = 1))+
            new_scale_fill()

m2 <- gheatmap(m1, lineage, width=.05, colnames=FALSE, offset=4) +
            scale_fill_manual(values = lineage_colors, name="Lineage", na.translate = FALSE)+
            guides(fill = guide_legend(order = 2))+
            new_scale_fill()

m3 <- gheatmap(m2, sublineage, width=.05, colnames=FALSE, offset=6) +
            scale_fill_manual(values = sublineage_colors,
                name="VNI Sublineage",
                na.translate = FALSE,
                limits = levels(sublineage$vni_subdivision))+
            guides(fill = guide_legend(order = 3))+
            new_scale_fill()

m4 <- gheatmap(m3, source, width=.05, colnames=FALSE, offset=8) +
        scale_fill_manual(values = source_colors, name="Source", na.translate = FALSE)+
        guides(fill = guide_legend(order = 4))+
        new_scale_fill()

m5 <- gheatmap(m4, dup_chroms, width=.32, colnames = FALSE, offset=11,) +
    scale_fill_manual(values = chrom_dup_colors, name="Duplicated\nchromosomes", na.translate = FALSE )+
    guides(fill = guide_legend(order = 5))+
    theme(legend.position = "right",
        legend.direction = "vertical",
        legend.title = element_text(size=9),
        legend.text=element_text(size=7),
        legend.key.size = unit(0.3, "cm"),
        plot.margin = margin(0, 0, 0, 0, "cm"))
m5

Code
ggsave(tree_merged_duplications_path, m5, height = 7, width = 7, units = "in", dpi = 900)

Tree of all samples with duplications of chromosomes 12 and 13

Subset the duplications_full data frame to only include strains with duplications of chromosomes 12 and 13

Code
dup_chroms_12_13 <- dup_chroms %>%
    select(chr12, chr13)
Code
p5 <- gheatmap(m4, dup_chroms_12_13, width=.1, colnames = FALSE, offset=11,) +
    scale_fill_manual(values = chrom_dup_colors, name="Duplicated\nchromosomes", na.translate = FALSE )+
    guides(fill = guide_legend(order = 5))+
    theme(legend.position = "right",
        legend.direction = "vertical",
        legend.title = element_text(size=9),
        legend.text=element_text(size=7),
        legend.key.size = unit(0.3, "cm"),
        plot.margin = margin(0, 0, 0, 0, "cm"))
p5

Tree with only the samples that have duplications and the references

Code
keep_strains <- c(levels(duplications_full$strain), "H99", "Bt22", "Bt81")
tree_dups <- drop.tip(tree, setdiff(tree$tip.label, keep_strains))
sublineage <- sublineage %>%
                filter(rownames(.) %in% keep_strains)%>%
                droplevels()

Dataset, lineage, sublineage, source, duplications

Code
p <- ggtree(tree_dups, layout = "rectangular", size = 0.5, branch.length = "none") + 
    geom_tiplab(aes(label = label), size = 3, align =TRUE, 
                    linetype = "dashed", linesize = 0.1, offset = 1)

p1 <- gheatmap(p, dataset, width=0.1, colnames=FALSE, offset=8) +
    scale_fill_manual(values = dataset_colors, name="Dataset", na.translate = FALSE)+
    guides(fill = guide_legend(order = 1))+
    new_scale_fill()

p2 <- gheatmap(p1, lineage, width=0.1, colnames=FALSE, offset=9.5) +
    scale_fill_manual(values = lineage_colors, name="Lineage", na.translate = FALSE)+
    guides(fill = guide_legend(order = 2))+
    new_scale_fill()

p3 <- gheatmap(p2, sublineage, width=0.1, colnames=FALSE, offset=11) +
    scale_fill_manual(values = sublineage_colors,
        name="VNI Sublineage", na.translate = FALSE,
        limits = levels(sublineage$vni_subdivision))+
    guides(fill = guide_legend(order = 3))+
    new_scale_fill()

p4 <- gheatmap(p3, source, width=0.1, colnames=FALSE, offset=12.5) +
        scale_fill_manual(values = source_colors, name="Source", na.translate = FALSE)+
        guides(fill = guide_legend(order = 4))+
        new_scale_fill()

p5 <- gheatmap(p4, dup_chroms, width=0.7, colnames = FALSE, offset=14) +
    scale_fill_manual(values = chrom_dup_colors, name="Duplicated\nchromosomes", na.translate = FALSE )+
    guides(fill = guide_legend(order = 5))+
    theme(legend.position = "right",
        legend.direction = "vertical",
        legend.title = element_text(size=9),
        legend.text=element_text(size=7),
        legend.key.size = unit(0.3, "cm"))
p5

Lineage, sublineage, duplications

Code
p <- ggtree(tree_dups, layout = "rectangular", size = 0.5, branch.length = "none") + 
    geom_tiplab(aes(label = label), size = 3, align =TRUE, 
                    linetype = "dashed", linesize = 0.1, offset = 1)
p1 <- gheatmap(p, lineage, width=0.1, colnames=FALSE, offset=8) +
    scale_fill_manual(values = lineage_colors, name="Lineage", na.translate = FALSE)+
    guides(fill = guide_legend(order = 2))+
    new_scale_fill()

p2 <- gheatmap(p1, sublineage, width=0.1, colnames=FALSE, offset=10) +
    scale_fill_manual(values = sublineage_colors,
        name="VNI Sublineage", na.translate = FALSE,
        limits = levels(sublineage$vni_subdivision))+
    guides(fill = guide_legend(order = 3))+
    new_scale_fill()

p3 <- gheatmap(p2, dup_chroms, width=0.7, colnames = FALSE, offset=12) +
    scale_fill_manual(values = chrom_dup_colors, name="Duplicated\nchromosomes", na.translate = FALSE )+
    guides(fill = guide_legend(order = 5))+
    theme(legend.position = "right",
        legend.direction = "vertical",
        legend.title = element_text(size=9),
        legend.text=element_text(size=7),
        legend.key.size = unit(0.3, "cm"))
p3

Lineage, duplications, sublineage

Code
p <- ggtree(tree_dups, layout = "rectangular", size = 0.5, branch.length = "none") + 
    geom_tiplab(aes(label = label), size = 3, align =TRUE, 
                    linetype = "dashed", linesize = 0.1, offset = 1)

p1 <- gheatmap(p, lineage, width=0.1, colnames=FALSE, offset=8) +
    scale_fill_manual(values = lineage_colors, name="Lineage", na.translate = FALSE)+
    guides(fill = guide_legend(order = 2))+
    new_scale_fill()

p2 <- gheatmap(p1, dup_chroms, width=0.7, colnames = FALSE, offset=10) +
    scale_fill_manual(values = chrom_dup_colors, name="Duplicated\nchromosomes", na.translate = FALSE )+
    guides(fill = guide_legend(order = 2))+
    new_scale_fill()

p3 <- gheatmap(p2, sublineage, width=0.1, colnames=FALSE, offset=19.5) +
    scale_fill_manual(values = sublineage_colors,
        name="VNI Sublineage", na.translate = FALSE,
        limits = levels(sublineage$vni_subdivision))+
    guides(fill = guide_legend(order = 3))+
    theme(legend.position = "right",
        legend.direction = "vertical",
        legend.title = element_text(size=9),
        legend.text=element_text(size=7),
        legend.key.size = unit(0.3, "cm"))
p3

Lineage, duplications

VNII = 94 VNBI = 55 VNBII = 52 VNI = 57

Code
d <- data.frame(node=c(94, 55, 52, 57), lineage=c("VNI", "VNBI", "VNBII", "VNII"))

p <- ggtree(tree_dups, layout = "rectangular", size = 0.5, branch.length = "none")+ 
            geom_hilight(data=d, aes(node=node, fill=lineage), alpha = 0.5)+
            scale_fill_brewer(name = "Lineage", palette = "Dark2")+
            new_scale_fill()

p4 <- gheatmap(p, dup_chroms, width=0.5, colnames = TRUE,  colnames_position = "top",offset=0.0) +
    scale_fill_manual(values = chrom_dup_colors, name="Duplicated\nchromosomes", na.translate = FALSE )+
    guides(fill = guide_legend(order = 2))+
    new_scale_fill()+
    theme(legend.position = "bottom",
        legend.direction = "horizontal")
p4

Code
tree2<- groupClade(tree_dups, c(94, 55, 52, 57))
d <- data.frame(node=c(94, 55, 52, 57), lineage=c("VNI", "VNBI", "VNBII", "VNII"))

p <- ggtree(tree2, aes(color = group), layout = "rectangular", 
                size = 1.5, branch.length = "none")+ 
            scale_color_manual(values = c("black","#1B9E77", "#D95F02", "#7570B3", "#E7298A" ), name = "Lineage")+
            new_scale_color()

p4 <- gheatmap(p, dup_chroms, width=0.8, colnames = TRUE,  
            colnames_position = "top", 
            offset=0.0,
            font.size = 6) +
    scale_fill_manual(values = chrom_dup_colors, name="Duplicated\nchromosomes", na.translate = FALSE )+
    guides(fill = guide_legend(order = 2))+
    new_scale_fill()+
    theme(legend.position = "bottom",
        legend.direction = "horizontal",
        legend.title = element_text(size=20),
        legend.text=element_text(size=15),
        legend.key.size = unit(1, "cm"))
p4

Code
tree2<- groupClade(tree_dups, c(94, 55, 52, 57))
d <- data.frame(node=c(94, 55, 52, 57), lineage=c("VNII", "VNBI", "VNBII", "VNI"))

p <- ggtree(tree2, layout = "rectangular", 
                size = 1.5, 
                color = "gray40",
                branch.length = "none")+
        geom_cladelab(node=94, label="VNII", offset = 10.3, fontsize = 5)+
        geom_cladelab(node=55, label="VNBI", offset = 10.3, fontsize = 5)+
        geom_cladelab(node=52, label="VNBII", offset = 10.3, fontsize = 5)+
        geom_cladelab(node=57, label="VNI", offset = 10.3, fontsize = 5)

p4 <- gheatmap(p, dup_chroms, width=0.8, colnames = TRUE,  
            colnames_position = "top", 
            offset=0,
            font.size = 6) +
    scale_fill_manual(values = chrom_dup_colors, name="Duplicated\nchromosomes", na.translate = FALSE )+
    guides(fill = guide_legend(order = 2))+
    new_scale_fill()+
    theme(legend.position = "none",
        legend.direction = "horizontal",
        legend.title = element_text(size=20),
        legend.text=element_text(size=15),
        legend.key.size = unit(1, "cm"))
p4

Code
tree2<- groupClade(tree_dups, c(94, 55, 52, 57))
d <- data.frame(node=c(94, 55, 52, 57), lineage=c("VNII", "VNBI", "VNBII", "VNI"))

p <- ggtree(tree2, layout = "rectangular", 
                size = 1.5, 
                color = "gray40",
                branch.length = "none")+
            geom_text(aes(label = d$lineage[match(node, d$node)]), 
                        size = 6, , fontface = "bold",
                        hjust = 1.25, vjust = -0.5)

p4 <- gheatmap(p, dup_chroms, width=0.8, colnames = TRUE,  
            colnames_position = "top", 
            offset=0,
            font.size = 6) +
    scale_fill_manual(values = chrom_dup_colors, name="Duplicated\nchromosomes", na.translate = FALSE )+
    guides(fill = guide_legend(order = 2))+
    new_scale_fill()+
    labs(title = "Duplicated chromosomes")+
    theme(legend.position = "none",
            plot.title = element_text(size = 25))

p4

Code
ggsave(tree_merged_duplications_only_duplicated, p4, height = 8, width = 13, units = "in", dpi = 900)
Warning: Removed 91 rows containing missing values or values outside the scale range
(`geom_text()`).